library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.21.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)
First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB37924. Only sigmoid biopsy samples that were analyzed by 16S rRNA sequencing were downloaded (see 00_Metadata-Mars).
# Directory that contains the raw (zipped) fastq files downloaded from SRA/ENA
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020"
# list.files(path) # sanity check of the directory content
# fastq filenames have format: SAMPLENAME_1.fastq.gz (FWD) / SAMPLENAME_2.fastq.gz (REV)
# Full paths to every forward and reverse read file, sorted so pairs stay aligned
raw_dir <- file.path(path, "original")
fnFs <- sort(list.files(raw_dir, pattern = "_1.fastq.gz", full.names = TRUE)) # FWD reads
fnRs <- sort(list.files(raw_dir, pattern = "_2.fastq.gz", full.names = TRUE)) # REV reads
show(fnFs[1:5])
show(fnRs[1:5])
# Sample name = everything before the first underscore of the file name
sample.names <- sub("_.*", "", basename(fnFs))
show(sample.names[1:5]) # saves only the file name (without the path)
# Visual check of per-cycle quality for the first few samples
for (i in 1:3) { # increase to 1:length(fnFs) to inspect every sample
  show(plotQualityProfile(fnFs[i]))
  show(plotQualityProfile(fnRs[i]))
}
# Number of raw reads per sample.
# NOTE(review): this was commented out, but `raw_stats` is used further down
# (primer-hit report and final saveRDS), so it must be computed here.
raw_stats <- data.frame('sample' = sample.names,
                        'reads' = seqTools::fastqq(fnFs)@nReads)
min(raw_stats$reads)
max(raw_stats$reads)
mean(raw_stats$reads)
We will have a quick peek at the per-base sequence content of the reads in some samples, to make sure there is no anomaly (e.g. all reads having the same sequence).
# Look at per base sequence content (forward read).
# Sample index 10 is an arbitrary spot check, not a special sample.
fseqF1 <- seqTools::fastqq(fnFs[10]) # per-file statistics (seqTools)
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_1.fastq.gz' done.
rcF1 <- read_content(fseqF1) # per-base nucleotide frequencies; presumably from qckitfastq (loaded above) -- confirm
plot_read_content(rcF1) + labs(title = "Per base sequence content - Forward read")
fseqR1 <- seqTools::fastqq(fnRs[10]) # same statistics for the reverse read of the same sample
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_2.fastq.gz' done.
rcR1 <- read_content(fseqR1)
plot_read_content(rcR1) + labs(title = "Per base sequence content - Reverse read")
Now, we will check whether the reads still contain the primers. The methods section indicates that the V4 region of the 16S rRNA gene was amplified following a previously published protocol, which also provides the primer sequences used below.
# V4 region primer pair (515F/806R); sequences contain IUPAC ambiguity codes (M, H, V, W)
FWD <- "GTGCCAGCMGCCGCGGTAA" # F515 forward primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # R806 reverse primer sequence
# Function that, from the primer sequence, returns all possible orientations
# (forward, complement, reverse, reverse complement) as character strings.
# Reads deposited on public archives may be stored in any orientation, so every
# orientation has to be screened for.
allOrients <- function(primer) {
  # Biostrings works with DNAString objects rather than character vectors.
  # Namespace-qualified calls replace the former `require(Biostrings)` --
  # require() should not be used for dependency loading (it only warns on failure).
  dna <- Biostrings::DNAString(primer)
  orients <- c(Forward = dna,
               Complement = Biostrings::complement(dna),
               Reverse = Biostrings::reverse(dna),
               RevComp = Biostrings::reverseComplement(dna))
  # Convert back to a named character vector for downstream pattern matching
  sapply(orients, toString)
}
# Compute every orientation of each primer; the reads are screened against all
# of them below
FWD.orients <- allOrients(FWD) # F515
REV.orients <- allOrients(REV) # R806
FWD.orients # sanity check
## Forward Complement Reverse
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG"
## RevComp
## "TTACCGCGGCKGCTGGCAC"
REV.orients
## Forward Complement Reverse
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG"
## RevComp
## "ATTAGAWACCCBDGTAGTCC"
# Count the reads in fastq file `fn` that contain `primer` at least once.
# fixed = FALSE lets IUPAC ambiguity codes in the primer match any compatible
# base; up to 2 mismatches are tolerated.
primerHits <- function(primer, fn) {
  reads <- sread(readFastq(fn))
  nhits <- vcountPattern(primer, reads, fixed = FALSE, max.mismatch = 2)
  sum(nhits > 0)
}
# Table of how many reads contain each orientation of the F515 and R806 primers,
# for the first three samples (spot check).
for (i in 1:3) {
  # NOTE(review): the original referenced raw_stats[i, 'reads'], but raw_stats
  # is commented out earlier in the file; count the reads directly instead.
  n_reads <- length(ShortRead::readFastq(fnFs[i]))
  cat("SAMPLE", sample.names[i], "with total number of", n_reads, "reads\n\n")
  x <- rbind(ForwardRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnFs[[i]]),
             ForwardRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnFs[[i]]),
             ReverseRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnRs[[i]]),
             ReverseRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnRs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE ERR5169583 with total number of 105593 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 102879 0 0 4
## ForwardRead.REVPrimer 4 0 0 96739
## ReverseRead.FWDPrimer 4 0 0 83295
## ReverseRead.REVPrimer 103571 0 0 3
##
## ____________________________________________
##
## SAMPLE ERR5169584 with total number of 129697 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 125832 0 0 13
## ForwardRead.REVPrimer 13 0 0 118601
## ReverseRead.FWDPrimer 13 0 0 103070
## ReverseRead.REVPrimer 127174 0 0 13
##
## ____________________________________________
##
## SAMPLE ERR5169585 with total number of 70485 reads
##
## Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer 68860 0 0 1
## ForwardRead.REVPrimer 1 0 0 64299
## ReverseRead.FWDPrimer 1 0 0 55897
## ReverseRead.REVPrimer 69315 0 0 1
##
## ____________________________________________
Almost all reads contain the forward & reverse primers. Ideally, to follow the standardized pipeline, we should (1) filter out reads not containing any primer and (2) trim the primers. However, the fastq files deposited on the SRA/ENA databases do not contain Illumina headers. Thus, during the merging step of paired reads later on, the Dada2 package won’t be able to match paired-reads.
We will simply perform a quality filtering of the reads, and trim the first 20 and last 30 bases of the reads (which contain the primers).
Then, we perform a quality filtering of the reads.
# Place filtered files in a filtered2/ subdirectory (created by filterAndTrim)
FWD.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_F_filt2.fastq.gz")) # FWD reads
REV.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_R_filt2.fastq.gz")) # REV reads
# Assign names for the filtered fastq.gz files
names(FWD.filt2_samples) <- sample.names
names(REV.filt2_samples) <- sample.names
# Quality-filter and trim the paired reads; filtered files are written to disk
# and out2 records reads.in/reads.out per sample
out2 <- filterAndTrim(fwd = fnFs, filt = FWD.filt2_samples,
rev = fnRs, filt.rev = REV.filt2_samples,
trimLeft = 20, trimRight=30, # trim primers (first 20 and last 30 bases; see note above on why cutadapt-style primer removal is not possible here)
maxEE=3, # reads with more than 3 expected errors (sum(10^(-Q/10))) are discarded
truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
minLen=150, # Discard reads shorter than 150 bp.
compress=TRUE,
multithread = TRUE,
verbose=TRUE)
Let’s look at the output filtered fastq files as sanity check.
out2[1:6,] # reads in vs reads out for the first six samples (sanity check)
## reads.in reads.out
## ERR5169583_1.fastq.gz 105593 67882
## ERR5169584_1.fastq.gz 129697 84891
## ERR5169585_1.fastq.gz 70485 46101
## ERR5169586_1.fastq.gz 86720 56496
## ERR5169587_1.fastq.gz 88688 58512
## ERR5169588_1.fastq.gz 44429 21535
# Re-inspect per-cycle quality after filtering (first three samples, spot check)
for (i in 1:3){
show(plotQualityProfile(FWD.filt2_samples[i])) # forward reads
show(plotQualityProfile(REV.filt2_samples[i])) # reverse reads
}
Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.
# Learn the parametric error model from the filtered reads
# (randomize = TRUE samples reads at random rather than in file order)
errF <- learnErrors(FWD.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1) # forward
errR <- learnErrors(REV.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1) # reverse
The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.
# Estimated (black line) vs observed (points) vs nominal Q-score (red line) error rates
plotErrors(errF, nominalQ = TRUE) # Forward reads
plotErrors(errR, nominalQ = TRUE) # Reverse reads
The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.
# Dereplicate the reads in each sample (collapses identical reads; per-position
# quality summaries are kept in $quals and used by dada())
derepF <- derepFastq(FWD.filt2_samples) # forward
derepR <- derepFastq(REV.filt2_samples) # reverse
# Infer sequence variants using the error models learned above
dadaFs <- dada(derepF, err=errF, multithread=TRUE) # forward
dadaRs <- dada(derepR, err=errR, multithread=TRUE) # reverse
# Inspect the inferred sequence variants for the first three samples
for (i in 1:3){
print(dadaFs[[i]])
print(dadaRs[[i]])
print("________________")
}
## dada-class: object describing DADA2 denoising results
## 217 sequence variants were inferred from 13852 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 134 sequence variants were inferred from 16309 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 226 sequence variants were inferred from 16473 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 125 sequence variants were inferred from 19321 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 188 sequence variants were inferred from 9899 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 111 sequence variants were inferred from 11937 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
We now need to merge paired reads.
# Merge the denoised forward and reverse reads into full-length sequences
mergers <- mergePairs(dadaFs, derepF, dadaRs, derepR, verbose=TRUE)
head(mergers[[1]]) # inspect merge results for the first sample
We can now construct an amplicon sequence variant (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.
# Make sequence table (samples x ASVs count matrix) from the merged sequence variants
seqtable <- makeSequenceTable(mergers)
# We should have 72 samples (72 rows)
dim(seqtable)
## [1] 72 3733
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")
# Sequences should be between 515F - 806R, so around ~250bp (removing the primers lengths).
# Remove any sequence variant outside the 200-300bp range
seqtable.new <- seqtable[,nchar(colnames(seqtable)) %in% 200:300]
dim(seqtable.new)
## [1] 72 1738
hist(nchar(getSequences(seqtable.new)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")
The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.
# Remove chimeric ASVs, identified by consensus across samples
seqtable.nochim <- removeBimeraDenovo(seqtable.new, method="consensus", multithread=TRUE, verbose=TRUE)
# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1] 72 1678
# Keep only ASVs that are present in at least 2 samples (drops single-sample ASVs)
seqtable.nochim <- seqtable.nochim[, colSums(seqtable.nochim > 0) >=2]
dim(seqtable.nochim)
## [1] 72 827
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable.new)
## [1] 0.9637866
Sanity check before assigning taxonomy.
# Function that counts nb of reads in a dada2 object
getN <- function(x) sum(getUniques(x))
# Track reads through each pipeline step: input, quality-filtered, denoised
# (F and R), merged, non-chimeric, plus the percentage of input reads surviving
# the whole pipeline.
# NOTE(review): the percentage column was built with lapply(), which put a
# *list* column inside the matrix; as.integer() on the vector is the intended form.
track <- cbind(out2,
               sapply(dadaFs, getN),
               sapply(dadaRs, getN),
               sapply(mergers, getN),
               rowSums(seqtable.nochim),
               as.integer(rowSums(seqtable.nochim) * 100 / out2[, 1]))
# Assign column and row names (out2 rows are in the same sorted order as sample.names)
colnames(track) <- c("input", "quality-filt", "denoisedF", "denoisedR", 'merged', 'nonchim', "%input->output")
rownames(track) <- sample.names
# Show final table: for each row/sample, the read count at every pipeline step
track
## input quality-filt denoisedF denoisedR merged nonchim
## ERR5169583 105593 67882 67514 67468 64359 45932
## ERR5169584 129697 84891 84386 84242 81354 62986
## ERR5169585 70485 46101 45855 45742 44299 35776
## ERR5169586 86720 56496 56148 56038 53806 45383
## ERR5169587 88688 58512 58086 58008 55905 38513
## ERR5169588 44429 21535 21249 21253 20536 15130
## ERR5169589 97536 61839 61359 61297 60215 23782
## ERR5169590 113298 73704 73246 73047 69120 44772
## ERR5169591 70662 45311 44986 44815 41039 38649
## ERR5169592 84848 53043 52654 52515 50040 35300
## ERR5169593 70070 45741 45278 45200 43624 33305
## ERR5169594 70514 46313 46066 45968 44669 28068
## ERR5169595 81234 51884 51578 51518 49541 42853
## ERR5169596 62654 39902 39785 39663 37516 35209
## ERR5169597 123998 82637 82288 82277 81354 46665
## ERR5169598 105876 71672 71412 71414 70545 26610
## ERR5169599 105644 72969 72674 72648 72158 5822
## ERR5169600 79270 50487 50177 50177 47739 45376
## ERR5169601 76041 47762 47346 47320 45669 36877
## ERR5169602 87122 59028 58789 58775 58113 3823
## ERR5169603 73742 48317 48114 48084 47010 37102
## ERR5169604 74329 46552 46278 46151 44192 39795
## ERR5169605 55292 36072 35901 35734 34195 29688
## ERR5169606 101878 66803 66575 66609 64616 61324
## ERR5169607 83029 52595 52324 52211 50107 39044
## ERR5169608 82800 56168 55822 55799 54485 18543
## ERR5169609 73407 48392 48111 48109 45666 42398
## ERR5169610 69347 47775 47518 47440 46274 19669
## ERR5169611 70806 45827 45462 45284 43149 36600
## ERR5169612 77124 52468 52161 52163 51271 18750
## ERR5169613 68381 43427 43265 43233 41903 38396
## ERR5169614 74287 46555 46216 46183 44623 34923
## ERR5169615 64188 41889 41656 41529 39654 34782
## ERR5169616 63924 40179 40011 39896 37615 30001
## ERR5169617 91806 63077 62795 62710 61169 19089
## ERR5169618 89209 60319 60007 60032 57906 27549
## ERR5169619 1633 701 680 633 562 447
## ERR5169620 72249 44767 44512 44430 42735 26880
## ERR5169621 86114 55549 55331 55240 52425 34289
## ERR5169622 52406 34619 34458 34469 32990 27632
## ERR5169623 97958 61968 61563 61422 59478 38696
## ERR5169624 65330 41531 41278 41111 38971 34505
## ERR5169625 55228 34342 34109 34007 32495 25990
## ERR5169626 75144 45993 45734 45536 42554 39371
## ERR5169627 65796 41340 41062 40964 38875 33484
## ERR5169628 67956 41265 40873 40627 37816 29111
## ERR5169629 72891 47614 47288 47245 45526 34550
## ERR5169630 83538 54506 54174 54242 51372 45837
## ERR5169631 67169 42595 42353 42336 41079 32009
## ERR5169632 53835 34606 34419 34322 32417 29395
## ERR5169633 52155 34003 33830 33732 32101 23546
## ERR5169634 76435 48921 48598 48472 45800 35697
## ERR5169635 53442 34376 34049 33930 31764 26694
## ERR5169636 41211 26655 26514 26453 25099 21816
## ERR5169637 46747 29274 29102 29119 27417 21194
## ERR5169638 76795 46229 45999 45990 45442 32260
## ERR5169639 71235 42595 42413 42382 40974 29714
## ERR5169640 39718 23558 23388 23360 22604 15244
## ERR5169641 68115 44439 44340 44329 43235 35316
## ERR5169642 17042 11307 11165 11140 11037 361
## ERR5169643 26208 16351 16162 16060 15450 8082
## ERR5169644 64295 40761 40620 40574 39237 32417
## ERR5169645 50906 30554 30265 30319 28395 25414
## ERR5169646 73257 47766 47571 47625 46251 30310
## ERR5169647 8075 5306 5216 5196 5124 198
## ERR5169648 7421 4659 4464 4441 4124 1112
## ERR5169649 47727 31004 30826 30859 28775 27240
## ERR5169650 75469 49539 49283 49101 46863 43176
## ERR5169651 17381 10780 10699 10526 9696 5151
## ERR5169652 14220 8886 8816 8677 8159 7466
## ERR5169653 49556 32426 32132 32058 30825 25537
## ERR5169654 83283 56457 55932 55870 54652 22196
## %input->output
## ERR5169583 43
## ERR5169584 48
## ERR5169585 50
## ERR5169586 52
## ERR5169587 43
## ERR5169588 34
## ERR5169589 24
## ERR5169590 39
## ERR5169591 54
## ERR5169592 41
## ERR5169593 47
## ERR5169594 39
## ERR5169595 52
## ERR5169596 56
## ERR5169597 37
## ERR5169598 25
## ERR5169599 5
## ERR5169600 57
## ERR5169601 48
## ERR5169602 4
## ERR5169603 50
## ERR5169604 53
## ERR5169605 53
## ERR5169606 60
## ERR5169607 47
## ERR5169608 22
## ERR5169609 57
## ERR5169610 28
## ERR5169611 51
## ERR5169612 24
## ERR5169613 56
## ERR5169614 47
## ERR5169615 54
## ERR5169616 46
## ERR5169617 20
## ERR5169618 30
## ERR5169619 27
## ERR5169620 37
## ERR5169621 39
## ERR5169622 52
## ERR5169623 39
## ERR5169624 52
## ERR5169625 47
## ERR5169626 52
## ERR5169627 50
## ERR5169628 42
## ERR5169629 47
## ERR5169630 54
## ERR5169631 47
## ERR5169632 54
## ERR5169633 45
## ERR5169634 46
## ERR5169635 49
## ERR5169636 52
## ERR5169637 45
## ERR5169638 42
## ERR5169639 41
## ERR5169640 38
## ERR5169641 51
## ERR5169642 2
## ERR5169643 30
## ERR5169644 50
## ERR5169645 49
## ERR5169646 41
## ERR5169647 2
## ERR5169648 14
## ERR5169649 57
## ERR5169650 57
## ERR5169651 29
## ERR5169652 52
## ERR5169653 51
## ERR5169654 26
Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v138.fa.gz file, and place it in the directory with the fastq files.
# Assign taxonomy (with silva v138 training set)
taxa <- assignTaxonomy(seqtable.nochim, "~/Projects/IBS_Meta-analysis_16S/data/silva_nr_v138_train_set.fa.gz",
tryRC = TRUE, # try reverse complement of the sequences
multithread=TRUE, verbose = TRUE)
# Add species-level assignment by exact matching against the Silva species fasta
taxa <- addSpecies(taxa, "~/Projects/IBS_Meta-analysis_16S/data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
## Kingdom Phylum Class Order
## [1,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## [2,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"
## [3,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## [4,] "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"
## [5,] "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"
## [6,] "Bacteria" "Firmicutes" "Clostridia" "Lachnospirales"
## Family Genus Species
## [1,] "Lachnospiraceae" "Blautia" NA
## [2,] "Bacteroidaceae" "Bacteroides" "vulgatus"
## [3,] "Lachnospiraceae" "Agathobacter" NA
## [4,] "Ruminococcaceae" "Faecalibacterium" NA
## [5,] "Ruminococcaceae" "Faecalibacterium" "prausnitzii"
## [6,] "Lachnospiraceae" "Fusicatenibacter" "saccharivorans"
table(taxa.print[,1]) # Show the different kingdoms (should be only bacteria)
##
## Archaea Bacteria Eukaryota
## 2 822 3
taxa.print[taxa.print[,1] == 'Eukaryota',2] # the eukaryota all have NA phyla
## [1] NA NA NA
table(taxa.print[,2]) # Show the different phyla
##
## Actinobacteriota Bacteroidota Campilobacterota Cyanobacteria
## 42 120 3 7
## Desulfobacterota Euryarchaeota Firmicutes Fusobacteriota
## 5 1 590 5
## Patescibacteria Proteobacteria Spirochaetota Thermoplasmatota
## 4 41 1 1
## Verrucomicrobiota
## 4
table(is.na(taxa.print[,2])) # is there any NA phyla? (these are removed below)
##
## FALSE TRUE
## 824 3
We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.
The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.
#_________________________
# Import metadata; drop the "X" index column left over from a previous write.csv()
metadata_table <- read.csv(file.path(path, "00_Metadata-Mars/modif_metadata(R).csv")) %>% select(-X)
colnames(metadata_table)
rownames(metadata_table) <- metadata_table$Run # Run accession = sample identifier
# Add a few study-level covariates
metadata_table$author <- "Mars"
metadata_table$sequencing_tech <- "Illumina"
metadata_table$variable_region <- "V4"
# Strip "_F_filt2.fastq.gz" from rownames so they match the Run accessions
# rownames(seqtable.nochim)[1:5]
rownames(seqtable.nochim) <- sub("_.*", "", rownames(seqtable.nochim))
#_________________________
# Create phyloseq object combining ASV counts, sample metadata, and taxonomy
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
sample_data(metadata_table),
tax_table(taxa))
# Remove taxa that are eukaryota, or have unassigned Phyla
# NOTE(review): Kingdom != "Eukaryota" also drops taxa whose Kingdom is NA
# (NA comparisons are excluded by subset_taxa) -- confirm that is intended
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# Save intermediate and final objects to disk
# NOTE(review): assumes the 01_Dada2-Mars/ output subdirectory already exists;
# confirm, or add dir.create(file.path(path, "01_Dada2-Mars"), showWarnings = FALSE)
if (exists("raw_stats")) {
  # raw_stats is only defined if the raw read-counting step was run
  # (it is commented out in some versions of this script)
  saveRDS(raw_stats, file.path(path, "01_Dada2-Mars/raw_stats.rds"))
}
saveRDS(out2, file.path(path, "01_Dada2-Mars/out2.rds"))
saveRDS(errF, file.path(path, "01_Dada2-Mars/errF.rds"))
saveRDS(errR, file.path(path, "01_Dada2-Mars/errR.rds"))
saveRDS(dadaFs, file.path(path, "01_Dada2-Mars/infered_seq_F.rds"))
saveRDS(dadaRs, file.path(path, "01_Dada2-Mars/infered_seq_R.rds"))
saveRDS(mergers, file.path(path, "01_Dada2-Mars/mergers.rds"))
saveRDS(otu_table(physeq), file.path(path, "01_Dada2-Mars/ASVtable_final.rds")) # ASV table
saveRDS(tax_table(physeq), file.path(path, "01_Dada2-Mars/taxa_final.rds")) # taxa table
saveRDS(sample_data(physeq), file.path(path, "01_Dada2-Mars/metadata_final.rds")) # metadata table
# Taxa & Phyloseq object, saved where the downstream meta-analysis steps expect them
saveRDS(taxa, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/Taxonomy/output/taxa_mars.rds")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/PhyloTree/input/physeq_mars.rds")
# Absolute abundance barplot per sample, faceted by disease status
plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())
# Relative abundance for Phylum
phylum.table <- physeq %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
# Stacked relative-abundance barplot, faceted by disease status
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(size = 5, angle = -90))+
labs(x = "Samples", y = "Relative abundance")